![Alt text](/Users/sophiang/Documents/am.jpeg "am")
# Dependencies and data acquisition ----
library(genius)
# Scrape the full lyric set for Arctic Monkeys' "AM" from Genius;
# returns one row per lyric line with track metadata columns.
arc_mon_am <- genius_album(artist = "Arctic Monkeys", album = "AM")
## Joining, by = c("track_title", "track_n", "track_url")
library(tidyverse)
## ── Attaching packages ───────────────── tidyverse 1.3.0 ──
## ✓ ggplot2 3.2.1 ✓ purrr 0.3.3
## ✓ tibble 2.1.3 ✓ dplyr 0.8.4
## ✓ tidyr 1.0.0 ✓ stringr 1.4.0
## ✓ readr 1.3.1 ✓ forcats 0.4.0
## ── Conflicts ──────────────────── tidyverse_conflicts() ──
## x dplyr::filter() masks stats::filter()
## x dplyr::lag() masks stats::lag()
# tidytext: tokenization + sentiment lexicons; topicmodels: LDA
library(tidytext)
library(topicmodels)
# Bing sentiment analysis: top 10 positive and negative words in the
# "AM" lyrics, plotted as a faceted bar chart.
# Explicit `by = "word"` documents the join key and silences the
# "Joining, by = ..." messages the original run emitted.
tidy_monkey <- arc_mon_am %>%
  unnest_tokens(word, lyric) %>%
  anti_join(stop_words, by = "word") %>%
  inner_join(get_sentiments("bing"), by = "word") %>%
  # count() already returns an ungrouped result, so no ungroup() needed
  count(word, sentiment, sort = TRUE) %>%
  group_by(sentiment) %>%
  # slice_max() supersedes top_n(); both keep ties, so the selected
  # words are unchanged
  slice_max(n, n = 10) %>%
  ungroup() %>%
  mutate(word = reorder(word, n)) %>%
  ggplot(aes(word, n, fill = sentiment)) +
  geom_col(show.legend = FALSE) +
  facet_wrap(~sentiment, scales = "free_y") +
  labs(y = "Contribution to sentiment",
       x = NULL) +
  coord_flip()
tidy_monkey
# textdata hosts the NRC lexicon download used by get_sentiments("nrc")
library(textdata)
# Preview the NRC lexicon: word/sentiment pairs; a word can carry
# several emotion tags (see "abandon" below)
get_sentiments("nrc")
## # A tibble: 13,901 x 2
## word sentiment
## <chr> <chr>
## 1 abacus trust
## 2 abandon fear
## 3 abandon negative
## 4 abandon sadness
## 5 abandoned anger
## 6 abandoned fear
## 7 abandoned negative
## 8 abandoned sadness
## 9 abandonment anger
## 10 abandonment fear
## # … with 13,891 more rows
# NRC sentiment analysis: top 10 words per NRC emotion category in the
# "AM" lyrics, faceted by sentiment.
# Explicit `by = "word"` documents the join key and silences the
# "Joining, by = ..." messages the original run emitted.
tidy_monkey_2 <- arc_mon_am %>%
  unnest_tokens(word, lyric) %>%
  anti_join(stop_words, by = "word") %>%
  inner_join(get_sentiments("nrc"), by = "word") %>%
  # count() already returns an ungrouped result, so no ungroup() needed
  count(word, sentiment, sort = TRUE) %>%
  group_by(sentiment) %>%
  # slice_max() supersedes top_n(); both keep ties, so the selected
  # words are unchanged
  slice_max(n, n = 10) %>%
  ungroup() %>%
  mutate(word = reorder(word, n)) %>%
  ggplot(aes(word, n, fill = sentiment)) +
  geom_col(show.legend = FALSE) +
  facet_wrap(~sentiment, scales = "free_y") +
  labs(y = "Contribution to sentiment",
       x = NULL) +
  coord_flip()
tidy_monkey_2
library(tm)
# Build a corpus from the AM lyric lines and normalise it step by step:
# lowercase -> collapse whitespace -> strip punctuation -> drop English
# stopwords -> stem. Same transformations, same order as before.
corpus_raw <- Corpus(VectorSource(arc_mon_am$lyric))
corpus <- corpus_raw %>%
  tm_map(content_transformer(tolower)) %>%
  tm_map(stripWhitespace) %>%
  tm_map(removePunctuation) %>%
  tm_map(removeWords, stopwords("english")) %>%
  tm_map(stemDocument)
# Document-term matrix: one row per lyric line, one column per stem
dtm <- DocumentTermMatrix(corpus)
# Tidy (document, term, count) form for tf-idf scoring
corpus_tidy <- tidy(dtm)
corpus_tidy %>%
  bind_tf_idf(term, document, count) %>%
  arrange(desc(tf_idf))
## # A tibble: 1,574 x 6
## document term count tf idf tf_idf
## <chr> <chr> <dbl> <dbl> <dbl> <dbl>
## 1 309 bumpin 1 1 6.19 6.19
## 2 350 forev 1 1 5.09 5.09
## 3 15 youv 1 1 4.80 4.80
## 4 34 youv 1 1 4.80 4.80
## 5 411 team 1 1 4.80 4.80
## 6 415 team 1 1 4.80 4.80
## 7 422 team 1 1 4.80 4.80
## 8 426 team 1 1 4.80 4.80
## 9 17 mayb 1 1 4.58 4.58
## 10 182 shoowop 4 1 4.58 4.58
## # … with 1,564 more rows
# Drop all-zero documents: LDA requires every row of the DTM to contain
# at least one non-zero entry (blank lyric lines become empty after
# stopword removal/stemming).
# snake_case name replaces `raw.sum` — dotted names clash with S3
# method-dispatch conventions.
row_totals <- apply(dtm, 1, sum)
dtm <- dtm[row_totals > 0, ]
# Fit a 3-topic LDA; the fixed seed keeps topic assignments reproducible.
output <- LDA(dtm, k = 3, control = list(seed = 1234))
# Per-topic word probabilities: one row per (topic, term) pair
beta <- tidy(output, matrix = "beta")
# Inspect the highest-probability terms, one pipeline per topic
beta %>%
  filter(topic == 1) %>%
  arrange(desc(beta))
## # A tibble: 560 x 3
## topic term beta
## <int> <chr> <dbl>
## 1 1 wanna 0.0439
## 2 1 your 0.0233
## 3 1 come 0.0213
## 4 1 get 0.0210
## 5 1 make 0.0209
## 6 1 just 0.0201
## 7 1 like 0.0192
## 8 1 road 0.0183
## 9 1 mine 0.0180
## 10 1 number 0.0161
## # … with 550 more rows
beta %>%
  filter(topic == 2) %>%
  arrange(desc(beta))
## # A tibble: 560 x 3
## topic term beta
## <int> <chr> <dbl>
## 1 2 wanna 0.0351
## 2 2 snap 0.0234
## 3 2 feel 0.0181
## 4 2 get 0.0175
## 5 2 come 0.0170
## 6 2 shoowop 0.0161
## 7 2 mine 0.0158
## 8 2 just 0.0155
## 9 2 thought 0.0127
## 10 2 ever 0.0114
## # … with 550 more rows
beta %>%
  filter(topic == 3) %>%
  arrange(desc(beta))
## # A tibble: 560 x 3
## topic term beta
## <int> <chr> <dbl>
## 1 3 ooh 0.0463
## 2 3 one 0.0225
## 3 3 come 0.0207
## 4 3 babi 0.0166
## 5 3 night 0.0155
## 6 3 shoowop 0.0154
## 7 3 know 0.0145
## 8 3 oooh 0.0144
## 9 3 there 0.0132
## 10 3 like 0.0128
## # … with 550 more rows
# Per-document topic probabilities (gamma) for the first six documents.
# All values hover near 1/3, i.e. the model does not separate these
# short per-line documents cleanly into topics.
round(head(posterior(output, dtm)$topics), digits = 3)
## 1 2 3
## 1 0.338 0.330 0.332
## 2 0.328 0.344 0.328
## 3 0.333 0.346 0.321
## 4 0.333 0.340 0.327
## 5 0.337 0.331 0.333
## 6 0.321 0.323 0.356
# Find the 10 highest-probability terms within each of the 3 topics.
# slice_max() supersedes top_n(); both keep ties, so the selected terms
# are unchanged.
monkey_top_terms <- beta %>%
  group_by(topic) %>%
  slice_max(beta, n = 10) %>%
  ungroup() %>%
  arrange(topic, -beta)
# Faceted bar chart; reorder_within() + scale_x_reordered() keep the
# bars sorted by beta inside each topic facet independently.
g_monkey_top_terms <- monkey_top_terms %>%
  mutate(term = reorder_within(term, beta, topic)) %>%
  ggplot(aes(term, beta, fill = factor(topic))) +
  geom_col(show.legend = FALSE) +
  facet_wrap(~ topic, scales = "free") +
  coord_flip() +
  scale_x_reordered()
g_monkey_top_terms
library(wordcloud)
## Loading required package: RColorBrewer
# Word cloud of the 70 most frequent stemmed terms in the AM corpus;
# random.order = FALSE draws the most frequent words in the centre.
# BUG FIX: the original call passed `ordered.clouds = TRUE`, which is
# not a wordcloud() argument (the real one is `ordered.colors`); it was
# silently swallowed by `...` and had no effect, so it is removed.
monkey_cloud <- wordcloud(corpus, max.words = 70, random.order = FALSE)